Problem 1

A

howell <- read.table("http://hompal-stats.wabarr.com/datasets/Howell_craniometry.csv", header=TRUE, sep=",")
library(dplyr)
howell <- howell %>% dplyr::select(ID, Sex, Population, BNL, MDH, EKB, ZOR, BAA, NBA)
head(howell)
##   ID Sex Population BNL MDH EKB ZOR BAA NBA
## 1  1   M      NORSE 100  31 100  81  39  76
## 2  2   M      NORSE 102  19  96  84  35  79
## 3  3   M      NORSE 102  28  97  82  38  72
## 4  4   M      NORSE 100  25  99  79  46  75
## 5  5   M      NORSE  97  26  97  79  42  80
## 6  6   M      NORSE 106  29  98  88  39  77

B

pairs(howell[,4:9])

C

cov(howell[,4:9])
##            BNL        MDH        EKB       ZOR        BAA        NBA
## BNL 33.7394473  8.8005539 14.2232895 22.304205 -2.2056656 -0.8760075
## MDH  8.8005539 14.9809962  6.3351161  5.002197  3.0393233  0.4381316
## EKB 14.2232895  6.3351161 17.9696276 12.515685  0.1239454 -1.8134923
## ZOR 22.3042053  5.0021967 12.5156847 24.287058 -1.6050285 -3.1503620
## BAA -2.2056656  3.0393233  0.1239454 -1.605029 10.2812444  0.6143717
## NBA -0.8760075  0.4381316 -1.8134923 -3.150362  0.6143717 11.2139906
#gives the same results
var(howell[,4:9])
##            BNL        MDH        EKB       ZOR        BAA        NBA
## BNL 33.7394473  8.8005539 14.2232895 22.304205 -2.2056656 -0.8760075
## MDH  8.8005539 14.9809962  6.3351161  5.002197  3.0393233  0.4381316
## EKB 14.2232895  6.3351161 17.9696276 12.515685  0.1239454 -1.8134923
## ZOR 22.3042053  5.0021967 12.5156847 24.287058 -1.6050285 -3.1503620
## BAA -2.2056656  3.0393233  0.1239454 -1.605029 10.2812444  0.6143717
## NBA -0.8760075  0.4381316 -1.8134923 -3.150362  0.6143717 11.2139906

Problem 2

A

distMat <- dist(x = howell[,4:9], method = "euclid")

B

HowellClust <- hclust(d = distMat, method = "average")

C

plot(HowellClust)

Problem 3

A

detach("package:dplyr", unload=TRUE)
library(MASS)
howellDFA <- lda(Population ~ BNL + MDH + EKB + ZOR + BAA + NBA, data=howell)
howellDFA
## Call:
## lda(Population ~ BNL + MDH + EKB + ZOR + BAA + NBA, data = howell)
## 
## Prior probabilities of groups:
##        AINU     ANDAMAN      ANYANG     ARIKARA      ATAYAL    AUSTRALI 
## 0.034072900 0.027733756 0.016640254 0.027337559 0.018621236 0.040015848 
##        BERG      BURIAT     BUSHMAN       DOGON    EASTER I       EGYPT 
## 0.043185420 0.043185420 0.035657686 0.039223455 0.034072900 0.043977813 
##      ESKIMO        GUAM      HAINAN      MOKAPU     MORIORI     N JAPAN 
## 0.042789223 0.022583201 0.032884311 0.039619651 0.042789223 0.034469097 
##     N MAORI       NORSE        PERU    PHILLIPI     S JAPAN     S MAORI 
## 0.003961965 0.043581616 0.043581616 0.019809826 0.036053883 0.003961965 
##    SANTA CR    TASMANIA       TEITA       TOLAI     ZALAVAR        ZULU 
## 0.040412044 0.034469097 0.032884311 0.043581616 0.038827258 0.040015848 
## 
## Group means:
##                BNL      MDH       EKB      ZOR      BAA      NBA
## AINU     103.65116 27.65116 100.36047 81.19767 37.59302 78.50000
## ANDAMAN   91.64286 24.17143  91.85714 72.65714 37.17143 80.15714
## ANYANG   101.28571 30.59524  98.78571 81.09524 40.76190 81.19048
## ARIKARA  100.75362 26.92754  97.91304 79.65217 41.37681 77.59420
## ATAYAL    96.38298 25.59574  95.19149 76.17021 38.91489 80.40426
## AUSTRALI  99.15842 27.37624  99.82178 81.00000 36.20792 74.76238
## BERG      95.83486 27.00000  96.99083 78.09174 40.94495 76.80734
## BURIAT    99.30275 27.74312  99.55963 84.33945 43.03670 75.69725
## BUSHMAN   93.07778 23.26667  95.32222 77.54444 35.65556 74.02222
## DOGON     96.58586 27.02020  97.12121 76.94949 37.75758 78.76768
## EASTER I 108.34884 27.59302  97.95349 85.61628 35.88372 79.31395
## EGYPT     98.81982 27.81982  94.02703 77.45045 40.13514 77.21622
## ESKIMO   103.31481 25.25000  98.41667 86.06481 39.65741 78.53704
## GUAM     102.63158 29.10526  99.57895 81.52632 39.57895 80.87719
## HAINAN    97.30120 27.54217  96.14458 78.45783 41.09639 81.27711
## MOKAPU   104.28000 28.29000  98.00000 82.71000 37.34000 79.65000
## MORIORI  103.59259 29.87037  98.82407 81.62037 40.87963 76.98148
## N JAPAN   99.33333 29.20690  97.25287 79.70115 41.41379 80.00000
## N MAORI  105.00000 32.80000  99.40000 83.20000 38.00000 77.80000
## NORSE     99.55455 27.78182  96.87273 80.03636 39.88182 75.45455
## PERU      93.29091 28.13636  93.11818 72.78182 41.55455 78.76364
## PHILLIPI  98.48000 29.62000  98.36000 79.46000 39.76000 79.86000
## S JAPAN   99.09890 27.89011  95.75824 79.56044 40.26374 80.80220
## S MAORI  105.60000 31.70000 100.20000 83.80000 39.70000 77.70000
## SANTA CR  95.04902 27.02941  97.01961 76.53922 40.63725 77.19608
## TASMANIA  97.34483 24.59770  98.51724 78.72414 35.37931 77.79310
## TEITA     98.72289 26.19277  97.55422 81.08434 37.16867 75.93976
## TOLAI     98.60000 27.45455  98.76364 81.85455 36.87273 79.70000
## ZALAVAR   99.09184 28.04082  96.56122 78.63265 39.78571 78.03061
## ZULU      99.85149 27.13861  99.45545 79.54455 38.14851 77.05941
## 
## Coefficients of linear discriminants:
##              LD1          LD2         LD3          LD4          LD5
## BNL -0.146996168 -0.006578434  0.15602185  0.251366265  0.008445963
## MDH  0.001150378 -0.079098956  0.06847894  0.003796849 -0.089985789
## EKB  0.131146953  0.061956012 -0.05293233 -0.066234722 -0.271118512
## ZOR -0.123586303  0.160200307 -0.23079779 -0.201077715  0.119939453
## BAA -0.250462354 -0.270173233 -0.17932627  0.005994658 -0.002678732
## NBA -0.183717324  0.061478572  0.18034978 -0.263888987 -0.021081306
##            LD6
## BNL  0.1230614
## MDH -0.2913560
## EKB  0.1378057
## ZOR -0.1514408
## BAA  0.1275435
## NBA  0.0360990
## 
## Proportion of trace:
##    LD1    LD2    LD3    LD4    LD5    LD6 
## 0.3472 0.2690 0.1871 0.1097 0.0583 0.0287

B

predictions <- predict(howellDFA, newdata = howell[,4:9])
sum(predictions$class == howell$Population) / nrow(howell)
## [1] 0.3098257

C

library(ggplot2)
qplot(x=LD1, y=LD2, color = howell$Population, data=as.data.frame(predictions$x), size=I(5)) + theme_bw(30)

D

sum(predictions$class == howell$Population) / length(predictions$class)
## [1] 0.3098257

E

It depends on how you look at it. It the ~30% correct classification rate isn’t great, but we would only expect about 1/30 = 3.3333333% by chance, so it isn’t as bad as it seems.